# coding=utf-8
import codecs
import csv
import logging
import re
import sys

import os

# Python 2 only: make UTF-8 the interpreter-wide default for implicit
# str <-> unicode conversions.  sys.setdefaultencoding is not reachable on an
# already-imported sys module, hence the reload() first.  The csv rows written
# by this script contain unicode values and rely on this implicit encoding.
reload(sys)
sys.setdefaultencoding("utf-8")

# import PyPDF2
#
# # creating an object
# file = open('D:/pics2/echinalife222/wwwe-chinalifecomIRchannelfilesChangfunengli20170427_1pdf.pdf', 'rb')
# # file = open('D:/aaa.pdf', 'rb')
# # creating a pdf reader object
# fileReader = PyPDF2.PdfFileReader(file, strict=True, warndest=None, overwriteWarnings=True)
#
# # print the number of pages in pdf file
# # print(fileReader.numPages)
# print(fileReader.getPage(5).extractText());

# Column titles of the output sheet: item name, previous quarter, current quarter.
header = [
    u'项目名称',
    u'上一季度',
    u'当前季度',
]

# Row labels searched for in every input file: admitted assets, admitted
# liabilities and actual capital, each with and without the
# "(RMB 10,000)" unit suffix.
keywords = [
    u'認可資產',
    u'認可負債',
    u'實際資本',
    u'認可資產（人民幣萬元）',
    u'認可負債（人民幣萬元）',
    u'實際資本（人民幣萬元）',
]

# Output file, opened in binary mode as the Python 2 csv module expects.
# It starts with a UTF-8 byte-order mark (presumably so that spreadsheet
# programs detect the encoding) followed by the header row.
out = open('D:/03.Documents/echinalife.csv', 'wb')
out.write(codecs.BOM_UTF8)
csv_write = csv.writer(out, dialect='excel')
csv_write.writerow(header)


def process(i, path):
    """Extract the keyword rows from one text file and append them to the CSV.

    For every entry of the module-level ``keywords`` list, the first
    occurrence of ``<keyword><optional whitespace><number><whitespace><number>``
    in the file is written through the module-level ``csv_write`` writer as a
    three-column row: keyword, first number, second number.

    Args:
        i: Index of the file in the directory listing.  Not used here; kept
            so existing callers keep working.
        path: Path of a UTF-8 encoded text file.

    Raises:
        Exception: If a keyword has no match in the file.  Rows for the
            keywords checked before it have already been written, and the
            remaining keywords are not attempted.
    """
    with codecs.open(path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Two numeric fields (digits, commas, dots).  They are captured as groups
    # so the row always has three columns, even when no whitespace separates
    # the keyword from the first number.
    numbers = r'\s*([\d,.]+)\s+([\d,.]+)'
    for keyw in keywords:
        # Keywords are literal text, not patterns, so escape them.
        found = re.search(re.escape(keyw) + numbers, content)
        if found is None:
            raise Exception("keyword %s not found..." % keyw)
        # The match is already text; no decode step is needed.
        csv_write.writerow([keyw, found.group(1), found.group(2)])


def process_dir(path):
    """Write a CSV section for every entry of the directory *path*.

    The name of each directory entry is written as its own single-column row
    through the module-level ``csv_write`` writer.  Entries that are regular
    files are then handed to ``process``.  A failure while processing one
    file is logged and does not stop the remaining entries.

    Args:
        path: Directory to scan, given either as text or as UTF-8 encoded
            bytes.
    """
    # Use a text path throughout so that the names returned by os.listdir
    # and the joined paths are text as well.  A path that is already text is
    # used as-is instead of being decoded a second time.
    rootdir = path.decode('utf-8') if isinstance(path, bytes) else path

    # List every file and sub-directory directly under the folder.
    for i, name in enumerate(os.listdir(rootdir)):
        csv_write.writerow([name])
        entry_path = os.path.join(rootdir, name)
        if not os.path.isfile(entry_path):
            continue
        try:
            process(i, entry_path)
        except Exception as ex:
            # Exception rather than BaseException, so that Ctrl-C and
            # SystemExit still stop the run.  The path is included so the
            # log says which file failed.
            logging.error("%s: %s", entry_path, ex)


# Run the extraction over the input folder, then close the output file so
# that every buffered row reaches the disk even if the run fails part-way.
try:
    process_dir('D:/03.Documents/echinalife_file')
finally:
    out.close()